0. Install Packages
pkgs <- c("jiebaR", "tidyverse", "tidytext", "stringr", "e1071", "tidyr", "Rtsne")
pkgs <- pkgs[!pkgs %in% installed.packages()[,"Package"]]
if(length(pkgs)) { install.packages(pkgs)}
library(tidyverse)
library(stringr)
options(stringsAsFactors = F)
Loading data
library(jiebaR)
segment_not <- c("鴻海" , "永豐金", "中信金", "台積電", "聯發科" ,"兆豐金", "台指期","郭台銘","張忠謀","鉅亨網")
cutter <- worker()
new_user_word(cutter,segment_not)
stopWords <- readRDS("data/stopWords.rds")
Stopwords
unnested.df <- stock_news %>%
select(doc_id = newsId, text = content, status = status_p) %>%
mutate(word = purrr::map(text, function(x)segment(x, cutter))) %>%
unnest(word) %>%
filter(!is.na(word)) %>%
filter(!word %in% stopWords$word) %>%
filter(!str_detect(word, "[a-zA-Z0-9]+")) %>%
filter(nchar(word) > 1)
- original dimension: > 610 news x 12,936 words
unnested.df %>%
count(doc_id, word) %>%
spread(word, n, fill = 0) %>% dim
[1] 610 12936
4. Chi-square feature selection
chi_df <- unnested.df %>%
count(word, status) %>%
filter(n > 3) %>%
spread(status, n, fill = 0) %>%
rename(A=`1`, C=`0`) %>%
mutate(B=sum(A)-A,
D=sum(C)-C,
N=A+B+C+D,
chi2 = (A*D - B*C)^2 * N / ((A+C)*(A+B)*(B+D)*(C+D))) %>%
filter(chi2 > 6.64)
5. Counting doc term frequency after feature selection
doc_term_count <- unnested.df %>%
left_join(chi_df) %>%
filter(!is.na(chi2)) %>%
count(doc_id, word)
Joining, by = "word"
doc_term_count %>%
spread(word, n, fill = 0) %>%
dim
[1] 609 546
6. TF-IDF(term frequency & inverse document frequency)
# install.packages("tidytext")
# dtm <- cast_dtm(word_token, title, words, n)
# ??cast_dtm
comb.df <- doc_term_count %>%
tidytext::bind_tf_idf(word, doc_id, n) %>%
select(doc_id, word, tf_idf) %>%
spread(word, tf_idf, fill=0) %>%
left_join(select(stock_news, doc_id = newsId, status = status_p)) %>%
select(doc_id, status, everything())
Joining, by = "doc_id"
7. T-SNE
library(Rtsne) # cannot be installed in MacOS mojave
tsne <- comb.df %>% select(-doc_id, -status) %>%
Rtsne(perplexity = 35, dims = 2, check_duplicates = F)
# 取出降維後的特徵值df
feature_tsne <- comb.df %>%
select(doc_id, status) %>%
mutate(status = as.factor(status)) %>%
bind_cols(as.data.frame(tsne$Y)) %>%
mutate(id = row_number())
plotting tsne results
feature_tsne %>%
ggplot() + aes(V1, V2, color = status) +
geom_point()

8. divide to tranining and testing set
set.seed(2017)
samples <- sample(1:nrow(feature_tsne),
size = round(nrow(feature_tsne)*0.6))
trainset <- feature_tsne %>% select(-doc_id) %>% slice(samples)
testset <- feature_tsne[-samples,-1]
9. SVM
library(e1071)
model <- svm(status~ ., data = trainset, kernel="radial")
plot(model, trainset, V1~V2)

predicting <- predict(model, testset %>% select(-status))
# creating confusion matrix
# https://en.wikipedia.org/wiki/Confusion_matrix
table(predicting, testset$status)
predicting 0 1
0 126 61
1 30 27
# accuracy
pre <- predicting == testset$status
percent1 <- length(pre[pre == T]) / length(pre)
percent1
[1] 0.6270492
LS0tCnRpdGxlOiAiU1ZNICYgU3RvY2sgUHJpY2UgUHJlZGljdGlvbiIKYXV0aG9yOiAiSmlsdW5nIEhzaWVoIgpkYXRlOiAiMjAxOC83LzMiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazogCiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIGhpZ2hsaWdodDogdGV4dG1hdGUKICAgIHRoZW1lOiBzcGFjZWxhYgogICAgdG9jOiB5ZXMKZWRpdG9yX29wdGlvbnM6IAogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUKLS0tCgoKIyMgMC4gSW5zdGFsbCBQYWNrYWdlcwoKCmBgYHtyIHByZS1pbnN0YWxsfQpwa2dzIDwtIGMoImppZWJhUiIsICJ0aWR5dmVyc2UiLCAidGlkeXRleHQiLCAic3RyaW5nciIsICJlMTA3MSIsICJ0aWR5ciIsICJSdHNuZSIpCnBrZ3MgPC0gcGtnc1shcGtncyAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygpWywiUGFja2FnZSJdXQppZihsZW5ndGgocGtncykpIHsgaW5zdGFsbC5wYWNrYWdlcyhwa2dzKX0KCmBgYAoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShzdHJpbmdyKQpvcHRpb25zKHN0cmluZ3NBc0ZhY3RvcnMgPSBGKQpgYGAKCiMgTG9hZGluZyBkYXRhCmBgYHtyfQpsb2FkKCJkYXRhL3N0b2NrX25ld3MuUkRhdGEiKQpzdG9ja19uZXdzICU+JSBuYW1lcwpzdG9ja19uZXdzICU+JSBzZWxlY3QobmV3c0lkLCB3b3JkcywgdGltZSwgY29kZSwgc3RhdHVzX3AsIHN0YXR1c192LCBldmVyeXRoaW5nKCkpICU+JSBWaWV3CmBgYAoKCmBgYHtyIGplaWJhUiBhbmQgc3RvcCB3b3JkfQpsaWJyYXJ5KGppZWJhUikKc2VnbWVudF9ub3QgPC0gYygi6bS75rW3IiAsICAi5rC46LGQ6YeRIiwgIuS4reS/oemHkSIsICLlj7DnqY3pm7siLCAi6IGv55m856eRIiAsIuWFhuixkOmHkSIsICLlj7DmjIfmnJ8iLCLpg63lj7DpipgiLCLlvLXlv6DorIAiLCLpiYXkuqjntrIiKQpjdXR0ZXIgPC0gd29ya2VyKCkKbmV3X3VzZXJfd29yZChjdXR0ZXIsc2VnbWVudF9ub3QpCnN0b3BXb3JkcyA8LSByZWFkUkRTKCJkYXRhL3N0b3BXb3Jkcy5yZHMiKQpgYGAKCgoKCiMgU3RvcHdvcmRzCgoKYGBge3J9Cgp1bm5lc3RlZC5kZiA8LSBzdG9ja19uZXdzICU+JQogICAgc2VsZWN0KGRvY19pZCA9IG5ld3NJZCwgdGV4dCA9IGNvbnRlbnQsIHN0YXR1cyA9IHN0YXR1c19wKSAlPiUKICAgIG11dGF0ZSh3b3JkID0gcHVycnI6Om1hcCh0ZXh0LCBmdW5jdGlvbih4KXNlZ21lbnQoeCwgY3V0dGVyKSkpICU+JQogICAgdW5uZXN0KHdvcmQpICU+JQogICAgZmlsdGVyKCFpcy5uYSh3b3JkKSkgJT4lIAogICAgZmlsdGVyKCF3b3JkICVpbiUgc3RvcFdvcmRzJHdvcmQpICU+JQogICAgZmlsdGVyKCFzdHJfZGV0ZWN0KHdvcmQsICJbYS16QS1aMC05XSsiKSkgJT4lCiAgICBmaWx0ZXIobmNoYXIod29yZCkgPiAxKQpgYGAKCgoqIG9yaWdpbmFsIGRpbWVuc2lvbjogCj4gNjEwIG5ld3MgeCAxMiw5MzYgd29yZHMKCmBgYHtyfQp1bm5lc3RlZC5kZiAlPiUKICAgIGNvdW50KGRvY19pZCwgd29yZCkgJT4lCiAgICBzcHJlYWQod29yZCwgbiwgZmlsbCA9IDApICU+JSBkaW0Kc2VsZWN0KDE6MjApICU+JSBoZWFkKDEwMCkgJT4lIFZpZXcKYGBgCgoKCiMgNC4gQ2hpLXNxdWFyZSBmZWF0dXJlIHNlbGVjdGlvbgoKYGBge3J9CmNoaV9kZiA8LSB1bm5lc3RlZC5kZiAlPiUKICAgIGNvdW50KHdvcmQsIHN0YXR1cykgJT4lCiAgICBmaWx0ZXIobiA+IDMpICU+JQogICAgc3ByZWFkKHN0YXR1cywgbiwgZmlsbCA9IDApICU+JQogICAgcmVuYW1lKEE9YDFgLCBDPWAwYCkgJT4lCiAgICBtdXRhdGUoQj1zdW0oQSktQSwKICAgICAgICAgICBEPXN1bShDKS1DLAogICAgICAgICAgIE49QStCK0MrRCwgCiAgICAgICAgICAgY2hpMiA9IChBKkQgLSBCKkMpXjIgKiBOIC8gKChBK0MpKihBK0IpKihCK0QpKihDK0QpKSkgJT4lCiAgICBmaWx0ZXIoY2hpMiA+IDYuNjQpCmBgYAoKCiMgNS4gQ291bnRpbmcgZG9jIHRlcm0gZnJlcXVlbmN5IGFmdGVyIGZlYXR1cmUgc2VsZWN0aW9uCmBgYHtyfQpkb2NfdGVybV9jb3VudCA8LSB1bm5lc3RlZC5kZiAlPiUKICAgIGxlZnRfam9pbihjaGlfZGYpICU+JQogICAgZmlsdGVyKCFpcy5uYShjaGkyKSkgJT4lCiAgICBjb3VudChkb2NfaWQsIHdvcmQpCmBgYAoKYGBge3J9CmRvY190ZXJtX2NvdW50ICU+JQogICAgc3ByZWFkKHdvcmQsIG4sIGZpbGwgPSAwKSAlPiUKICAgIGRpbQpgYGAKCgoKCgojIDYuIFRGLUlERu+8iHRlcm0gZnJlcXVlbmN5ICYgaW52ZXJzZSBkb2N1bWVudCBmcmVxdWVuY3nvvIkKCmBgYHtyfQpsaWJyYXJ5KHRpZHl0ZXh0KQojIGluc3RhbGwucGFja2FnZXMoInRpZHl0ZXh0IikKIyBkdG0gPC0gY2FzdF9kdG0od29yZF90b2tlbiwgdGl0bGUsIHdvcmRzLCBuKQojID8/Y2FzdF9kdG0KY29tYi5kZiA8LSBkb2NfdGVybV9jb3VudCAlPiUKICAgIHRpZHl0ZXh0OjpiaW5kX3RmX2lkZih3b3JkLCBkb2NfaWQsIG4pICU+JQogICAgc2VsZWN0KGRvY19pZCwgd29yZCwgdGZfaWRmKSAlPiUKICAgIHNwcmVhZCh3b3JkLCB0Zl9pZGYsIGZpbGw9MCkgJT4lCiAgICBsZWZ0X2pvaW4oc2VsZWN0KHN0b2NrX25ld3MsIGRvY19pZCA9IG5ld3NJZCwgc3RhdHVzID0gc3RhdHVzX3ApKSAlPiUKICAgIHNlbGVjdChkb2NfaWQsIHN0YXR1cywgZXZlcnl0aGluZygpKQpgYGAKCgoKCiMgNy4gVC1TTkUKCgpgYGB7cn0KbGlicmFyeShSdHNuZSkgCgp0c25lIDwtIGNvbWIuZGYgJT4lIHNlbGVjdCgtZG9jX2lkLCAtc3RhdHVzKSAlPiUKICAgIFJ0c25lKHBlcnBsZXhpdHkgPSAzNSwgZGltcyA9IDIsIGNoZWNrX2R1cGxpY2F0ZXMgPSBGKQoKZmVhdHVyZV90c25lIDwtIGNvbWIuZGYgJT4lCiAgICBzZWxlY3QoZG9jX2lkLCBzdGF0dXMpICU+JQogICAgbXV0YXRlKHN0YXR1cyA9IGFzLmZhY3RvcihzdGF0dXMpKSAlPiUKICAgIGJpbmRfY29scyhhcy5kYXRhLmZyYW1lKHRzbmUkWSkpICU+JQogICAgbXV0YXRlKGlkID0gcm93X251bWJlcigpKQpgYGAKCiMgcGxvdHRpbmcgdHNuZSByZXN1bHRzCgpgYGB7cn0KZmVhdHVyZV90c25lICU+JQogICAgZ2dwbG90KCkgKyBhZXMoVjEsIFYyLCBjb2xvciA9IHN0YXR1cykgKyAKICAgIGdlb21fcG9pbnQoKQpgYGAKCgoKCgojIDguIGRpdmlkZSB0byB0cmFuaW5pbmcgYW5kIHRlc3Rpbmcgc2V0CmBgYHtyfQpzZXQuc2VlZCgyMDE3KQoKc2FtcGxlcyA8LSBzYW1wbGUoMTpucm93KGZlYXR1cmVfdHNuZSksIAogICAgICAgICAgICAgICAgICBzaXplID0gcm91bmQobnJvdyhmZWF0dXJlX3RzbmUpKjAuNikpCgp0cmFpbnNldCA8LSBmZWF0dXJlX3RzbmUgJT4lIHNlbGVjdCgtZG9jX2lkKSAlPiUgc2xpY2Uoc2FtcGxlcykKdGVzdHNldCA8LSBmZWF0dXJlX3RzbmVbLXNhbXBsZXMsLTFdCgpgYGAKCgoKIyA5LiBTVk0KYGBge3J9CgpsaWJyYXJ5KGUxMDcxKQptb2RlbCA8LSBzdm0oc3RhdHVzfiAuLCBkYXRhID0gdHJhaW5zZXQsIGtlcm5lbD0icmFkaWFsIikKcGxvdChtb2RlbCwgdHJhaW5zZXQsIFYxflYyKQoKcHJlZGljdGluZyAgPC0gcHJlZGljdChtb2RlbCwgdGVzdHNldCAlPiUgc2VsZWN0KC1zdGF0dXMpKQoKIyBjcmVhdGluZyBjb25mdXNpb24gbWF0cml4CiMgaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvQ29uZnVzaW9uX21hdHJpeAp0YWJsZShwcmVkaWN0aW5nLCB0ZXN0c2V0JHN0YXR1cykKCiMgYWNjdXJhY3kKcHJlIDwtIHByZWRpY3RpbmcgPT0gdGVzdHNldCRzdGF0dXMKcGVyY2VudDEgPC0gbGVuZ3RoKHByZVtwcmUgPT0gVF0pIC8gbGVuZ3RoKHByZSkKcGVyY2VudDEKYGBgCg==